/*************************************************************************
 * The contents of this file are subject to the MYRICOM MYRINET          *
 * EXPRESS (MX) NETWORKING SOFTWARE AND DOCUMENTATION LICENSE (the       *
 * "License"); User may not use this file except in compliance with the  *
 * License.  The full text of the License can found in LICENSE.TXT       *
 *                                                                       *
 * Software distributed under the License is distributed on an "AS IS"   *
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See  *
 * the License for the specific language governing rights and            *
 * limitations under the License.                                        *
 *                                                                       *
 * Copyright 2003 - 2004 by Myricom, Inc.  All rights reserved.          *
 *************************************************************************/

static const char __idstring[] = "@(#)$Id: mx.c,v 1.51 2006/12/13 22:55:21 reese Exp $";
#include "mx_arch.h"
#include "mx_instance.h"
#include "mx_malloc.h"
#include "mx_misc.h"
#include "mx_peer.h"
#include "mx_stbar.h"
#include "mx_version.h"
#include <sys/errno.h>
#include <sys/random.h>
#include <sys/utsname.h>
#include <sys/systm.h>
#include <sys/pci.h>

/* Tunable; mx_read_conf() reloads it from the "mx_nvidia_ecrc_enable"
   driver property, defaulting to 1. */
static int mx_nvidia_ecrc_enable = 1;
/* Value for the FLAG argument of mx_solaris_reg_set(): when passed, that
   routine neither consults nor updates the per-instance soft state. */
#define ENABLE_NVIDIA_ECRC  1

int64_t getpid(void);

#define MX_UNTRANSLATED_SYSTEM_ERROR 1001   /* correct them later */
#define MX_INTERNAL_ERROR            1002

/* Per-minor-number bookkeeping.  mx_minors points to an array of these,
   indexed by minor number. */
typedef struct mx_minor_state 
{
  mx_instance_state_t *is;	/* presumably the instance given to mx_minor_alloc() -- TODO confirm */
  mx_endpt_state_t *es;		/* endpoint attached to this minor, or NULL when none */
  uint8_t privileged;		/* NOTE(review): meaning not visible in this part of the file */
} mx_minor_state_t;

/* Minor-number table.  mx_umem_callback() holds mx_minor_mutex while it
   walks mx_minors[0 .. mx_max_minor-1]. */
static kmutex_t mx_minor_mutex;
int mx_max_minor = 0;
mx_minor_state_t *mx_minors = 0;
int mx_minor_alloc(mx_instance_state_t *is, int *minor);
void mx_minor_free(int minor);

/* Driver-wide lock, taken by mx_attach() and mx_umem_callback(). */
/* initialized in _init. */
kmutex_t mx_driver_mutex;

static int mx_open (dev_t * devp, int flags, int otyp,
		    cred_t * credp);
static int mx_close (dev_t dev, int flags, int otyp, cred_t * credp);
static int mx_devmap (dev_t dev, devmap_cookie_t dhp, offset_t off, 
		      size_t len, size_t *maplen, uint_t model);
static int mx_ioctl (dev_t dev, int cmd, intptr_t arg,
		     int mode, cred_t * cred_p, int *rval_p);
static int mx_read(dev_t dev, struct uio *uiop, cred_t *credp);
static int mx_identify (dev_info_t * dip);
static int mx_attach (dev_info_t * dip, ddi_attach_cmd_t cmd);
static int mx_detach (dev_info_t * dip, ddi_detach_cmd_t cmd);
static int mx_getinfo (dev_info_t * dip, ddi_info_cmd_t infocmd,
		       void *arg, void **result);

/* Prefix of the module description string.  mx_attach() fills
   mx_driver_version with this prefix followed by MX_VERSION_STR the first
   time it runs; modldrv points at the buffer. */
#define MX_DRIVER_VERSION       "Myrinet MX Driver "
static char mx_driver_version[40] = {0};

/********************************************/
/* Loadable module required data structures */
/********************************************/

/* Character-device entry points for the MX device nodes.  Entry points the
   driver does not implement are filled with nodev / nochpoll. */
static struct cb_ops mx_cb_ops = {
  mx_open,			/* open */
  mx_close,			/* close */
  nodev,			/* not a block driver */
  nodev,			/* no print routine */
  nodev,			/* no dump routine */
  mx_read,			/* read */
  nodev,			/* no write routine */
  mx_ioctl,			/* ioctl */
  mx_devmap,			/* devmap routine */
  nodev,			/* no mmap */
  nodev,			/* no segmap */
  nochpoll,			/* no chpoll routine */
  ddi_prop_op,			/* prop_op -- using default */
  NULL,				/* cb_str not a STREAMS driver */
#if MX_SIZEOF_UP_T == 8
  D_64BIT |			/* device handles 64-bit offsets */
#endif
  D_DEVMAP | D_NEW | D_MP,	/* safe for multi-thread/multi-processor */
  CB_REV,			/* revision */
  nodev,			/* no aread routine */
  nodev				/* no awrite routine */
};

/* Device operations handed to the kernel through modldrv; ties the
   autoconfiguration entry points to the cb_ops table above. */
static struct dev_ops mx_ops = {
  DEVO_REV,			/* DEVO_REV indicated by manual  */
  0,				/* device reference count */
  mx_getinfo,			/* getinfo */
  mx_identify,			/* identify (contrary to docs, IS needed) */
  nulldev,			/* probe (nulldev self-identifying devices) */
  mx_attach,			/* attach */
  mx_detach,			/* detach */
  nodev,			/* device reset routine -- set to nodev */
  &mx_cb_ops,			/* character/block entry points */
  (struct bus_ops *) NULL,	/* bus operations */
};

/* Loadable-driver linkage element: module type, description string and
   device operations. */
static struct modldrv modldrv = {
  &mod_driverops,		/* this module is a device driver */
  mx_driver_version,		/* description; filled in by mx_attach() */
  &mx_ops,			/* driver operations */
};


/* Module linkage: a NULL-terminated list containing the single modldrv
   element above. */
static struct modlinkage modlinkage = {
  MODREV_1,
  {&modldrv, NULL},
};

/*****************/
/* Other globals */
/*****************/

void *mx_instancep;		/* For per-device structures */

/* Template DMA attributes for the board.  The align and granularity fields
   are left 0 here and are expected to be set before use; mx_pin_vpages()
   makes a local copy and overrides align and sgllen. */
ddi_dma_attr_t mx_dma_attr = {
  DMA_ATTR_V0,			/* version */
  0x0,				/* addr_lo */
  (uint64_t)0xFFFFFFFFFFFFFFFFULL,	/* addr_hi */
  0x7FFFFFFF,			/* count_max */
  0 /* set before used */ ,	/* align (for DMA resources) */
  0x7F,				/* burstsizes */
  1,				/* minxfer */
  0x7FFFFFFF,			/* maxxfer */
  0x7FFFFFFF,			/* seg */
  0x7FFFFFFF,			/* sgllen */
  0, /* set before used */	/* dma granularity */
  0				/* flags -- must be 0 */
};

/* WC selects the data-ordering attribute used for register mappings:
   merging accesses everywhere except on sparcv9, where strict ordering
   is requested. */
#if defined sparc64 || defined __sparcv9
#define       WC 0
#else
#define       WC 1
#endif

/* Access attributes passed to ddi_regs_map_setup() by mx_map_io_space(). */
struct ddi_device_acc_attr mx_dev_access_attr = {
  DDI_DEVICE_ATTR_V0,		/* version */
  DDI_NEVERSWAP_ACC,		/* endian flags */
#if WC
  DDI_MERGING_OK_ACC		/* data order */
#else
  DDI_STRICTORDER_ACC		/* data order */
#endif
};

unsigned int *mx_instance_ok;   /* array of size mx_max_instance */
                                /* initialized in _init */

/*********************************************
 * PCI configuration space access functions. *
 *********************************************/
/* Fetch the 32-bit PCI configuration register at OFFSET through the
   instance's config-space access handle and store it in *VALUE.
   Always returns 0. */
int
mx_read_pci_config_32 (mx_instance_state_t * is, 
		       uint32_t offset, uint32_t * value)
{
  uint32_t reg;

  reg = pci_config_get32 (is->arch.pci_acc_handle, offset);
  *value = reg;

  return 0;
}

/* Store the 32-bit VALUE into the PCI configuration register at OFFSET
   through the instance's config-space access handle.  Always returns 0. */
int
mx_write_pci_config_32 (mx_instance_state_t * is, uint32_t offset,
			uint32_t value)
{
  pci_config_put32 (is->arch.pci_acc_handle, offset, value);

  return 0;
}

/* Fetch the 16-bit PCI configuration register at OFFSET through the
   instance's config-space access handle and store it in *VALUE.
   Always returns 0. */
int
mx_read_pci_config_16 (mx_instance_state_t * is, 
		       uint32_t offset, uint16_t * value)
{
  uint16_t reg;

  reg = pci_config_get16 (is->arch.pci_acc_handle, offset);
  *value = reg;

  return 0;
}

/* Store the 16-bit VALUE into the PCI configuration register at OFFSET
   through the instance's config-space access handle.  Always returns 0. */
int
mx_write_pci_config_16 (mx_instance_state_t * is, uint32_t offset,
			uint16_t value)
{
  pci_config_put16 (is->arch.pci_acc_handle, offset, value);

  return 0;
}

/* Fetch the 8-bit PCI configuration register at OFFSET through the
   instance's config-space access handle and store it in *VALUE.
   Always returns 0. */
int
mx_read_pci_config_8 (mx_instance_state_t * is, 
		      uint32_t offset, uint8_t * value)
{
  uint8_t reg;

  reg = pci_config_get8 (is->arch.pci_acc_handle, offset);
  *value = reg;

  return 0;
}

/* Store the 8-bit VALUE into the PCI configuration register at OFFSET
   through the instance's config-space access handle.  Always returns 0. */
int
mx_write_pci_config_8 (mx_instance_state_t * is, uint32_t offset,
		       uint8_t value)
{
  pci_config_put8 (is->arch.pci_acc_handle, offset, value);

  return 0;
}


/********************************
 * MX synchronization functions *
 ********************************/

/* Delay the caller for USECS microseconds.
 *
 * By default this is a plain drv_usecwait().  When
 * MX_SOLARIS_COMPLICATED_SPIN is set, the delay is instead a timed,
 * signal-interruptible wait on a per-instance condition variable.
 *
 * NOTE(review): the MX_SOLARIS_COMPLICATED_SPIN branch refers to `is',
 * which is neither a parameter nor a local of this function, so that
 * branch does not look compilable as written -- confirm before enabling.
 */
void
mx_spin (uint32_t usecs)
{
#if MX_SOLARIS_COMPLICATED_SPIN
  long timeout_time;  
  /* absolute deadline = current lbolt + usecs converted to clock ticks */
  drv_getparm (LBOLT, &timeout_time);
  timeout_time += drv_usectohz (usecs);

  mutex_enter (&is->arch.spin_sync._mu);
  cv_timedwait_sig (&is->arch.spin_sync.cv, &is->arch.spin_sync._mu,
                    timeout_time);
  mutex_exit (&is->arch.spin_sync._mu);
#else
  drv_usecwait (usecs);
#endif
}

/****************************************************************
 * Synchronization functions
 ****************************************************************/

/* For Solaris, we must be careful to pass an appropriate iblock
   cookie to mx_mutex_init if the mutex is ever used by the interrupt
   handler. */

/* Prepare the synchronization object S for use: clear it, create its two
   mutexes and its condition variable, and stamp it with MX_ARCH_SYNC_TAG.
   The IS, UNIQUE and STR arguments are accepted but not used here. */
void
mx_sync_init (mx_sync_t *s, mx_instance_state_t *is, int unique, char *str)
{
  bzero (s, sizeof (*s));

  /* Neither mutex is given an interrupt block cookie. */
  mutex_init (&s->_mu, "MX condvar mutex", MUTEX_DRIVER, 0);
  mutex_init (&s->mu, "MX mutex", MUTEX_DRIVER, 0);
  cv_init (&s->cv, "MX condvar", CV_DRIVER, NULL);

  s->tag = MX_ARCH_SYNC_TAG;
}

/* Tear down a synchronization object set up by mx_sync_init(): destroy the
   condition variable and both mutexes, then clear the whole structure. */
void
mx_sync_destroy(mx_sync_t *s)
{
  cv_destroy (&s->cv);

  mutex_destroy (&s->mu);
  mutex_destroy (&s->_mu);

  bzero (s, sizeof (*s));
}

/* Discard any wake-ups recorded on S by zeroing its wake counter.
   No lock is taken here. */
void
mx_sync_reset (mx_sync_t *s)
{
  s->wake_cnt = 0;
}


/* "Spinlocks" are the locks shared with the interrupt handler.  Initialize
   S as a driver mutex named STRING, using the interrupt block cookie of IS.
   At least one spinlock is shared among boards and is initialized with
   IS == NULL; such a lock gets a zero cookie.  ENDPOINT is not used. */
void
mx_spin_lock_init(mx_spinlock_t *s, mx_instance_state_t *is, 
		  int endpoint, char *string)
{
  ddi_iblock_cookie_t cookie = 0;

  if (is != NULL)
    cookie = is->arch.iblock_cookie;

  mutex_init(s, string, MUTEX_DRIVER, cookie);
}

/* Release the mutex backing a spinlock set up by mx_spin_lock_init(). */
void
mx_spin_lock_destroy(mx_spinlock_t *s)
{
  mutex_destroy(s);
}


/*****************************************************************
 * Sleep functions
 *****************************************************************/

/* The interrupt handler increments WAKE_CNT each time a wake
   interrupt is received, and the user threads decrement WAKE_CNT
   each time they claim a wake interrupt.  */

/* Post one wake-up on S: under the condvar mutex, bump the wake counter and
   signal the condition variable so that a thread blocked in mx_sleep() can
   claim it. */

void
mx_wake(mx_sync_t * s)
{
  MX_DEBUG_PRINT (MX_DEBUG_SLEEP, ("mx_wake() called.\n"));

  mutex_enter (&s->_mu);
  MX_DEBUG_PRINT (MX_DEBUG_SLEEP, ("mx_wake() entered mutex.\n"));

  s->wake_cnt += 1;
  cv_signal (&s->cv);

  mutex_exit (&s->_mu);

  MX_DEBUG_PRINT (MX_DEBUG_SLEEP, ("mx_wake() returning.\n"));
}

/*
 * Sleep on S until a wake-up is available, the timeout expires, or (when
 * MX_SLEEP_INTR is set in FLAGS) a signal arrives.
 *
 *   s      synchronization object; its wake counter is consumed on success
 *   ms     timeout in milliseconds, or MX_MAX_WAIT to wait without a timeout
 *   flags  MX_SLEEP_INTR makes the wait interruptible by signals
 *
 * Returns 0 when a wake-up was claimed, EAGAIN on timeout, EINTR on signal.
 */

int
mx_sleep (mx_sync_t *s, int ms, int flags)
{
  long timeout_time = 0;
  int timed = (ms != MX_MAX_WAIT);
  int ret;

  MX_DEBUG_PRINT (MX_DEBUG_SLEEP, ("mx_sleep() called.\n"));

  /* Only compute an absolute deadline when one will actually be used, and
     do the ms -> us conversion in long arithmetic so that large timeouts
     (and the MX_MAX_WAIT sentinel) are not multiplied as an int. */
  if (timed) {
    drv_getparm(LBOLT, &timeout_time);
    timeout_time += drv_usectohz((long)ms * 1000);
  }

  mutex_enter(&s->_mu);
  MX_DEBUG_PRINT (MX_DEBUG_SLEEP, ("mx_sleep() entered mutex.\n"));
  
  /*
   * cv_* returns:
   *  -1 = timed out
   *   0 = kill()-style signal received
   *  >0 = condition *may* have been signalled
   */
  ret = 1;	/* initialize to > 0 for loop */
  while (ret > 0 && s->wake_cnt <= 0) {
    if (flags & MX_SLEEP_INTR) {
      if (timed) {
	ret = cv_timedwait_sig(&s->cv, &s->_mu, timeout_time);
      } else {
	ret = cv_wait_sig(&s->cv, &s->_mu);
      }
    }
    else {
      if (timed) {
	ret = cv_timedwait(&s->cv, &s->_mu, timeout_time);
      } else {
	/* cv_wait() has no return value; ret stays > 0 and the loop
	   re-checks wake_cnt. */
	cv_wait(&s->cv, &s->_mu);
      }
    }
  }

  /* If we got a successful wake, consume it */
  if (ret > 0) {
    --s->wake_cnt;
  }
   
  mutex_exit(&s->_mu);

  MX_DEBUG_PRINT (MX_DEBUG_SLEEP, ("mx_sleep() returning.\n"));

  if (ret < 0) {
    return EAGAIN;
  } else if (ret == 0) {
    return EINTR;
  } else {
    return 0;
  }
}

static void
mx_solaris_intr(void *arg)
{
  mx_instance_state_t *is = arg;
  mx_common_interrupt(is);
}

/****************************
 * Kernel Memory Allocation *
 ****************************/

/*
 * malloc() analogue for the driver.
 *
 * Allocates LEN bytes with KM_NOSLEEP (zero-filled when MX_MZERO is set in
 * FLAGS) and returns a pointer to them, or NULL if the allocation fails.
 * An 8-byte header is placed in front of the returned buffer; its first
 * 32-bit word records the total allocation size so that mx_kfree() can
 * pass it back to kmem_free().
 */

void *
mx_kmalloc(size_t len, uint32_t flags)
{
  uint32_t *p;
  size_t total;

  /* The size is recorded in a 32-bit header word.  Refuse requests whose
     total would not fit (or would wrap), since mx_kfree() would otherwise
     release the block with a truncated size. */
  if (len > (size_t)0xFFFFFFFFU - 8)
    return NULL;
  total = len + 8;

  /* Allocate extra space at beginning to store length for kmem_free call, and
     return an 8-byte aligned buffer. */
  if (flags & MX_MZERO)
    p = (uint32_t *) kmem_zalloc(total, KM_NOSLEEP);
  else
    p = (uint32_t *) kmem_alloc (total, KM_NOSLEEP);

  if (p == NULL)
    return NULL;

  p[0] = (uint32_t)total;

  return &p[2];
}

/*
 * free() analogue for the driver.
 *
 * Releases a buffer obtained from mx_kmalloc().  The allocation size is
 * read back from the 32-bit word stored 8 bytes before PTR.  As with
 * free(), a NULL PTR is accepted and ignored.
 */

void
mx_kfree(void *ptr)
{
  uint32_t *p;

  if (ptr == NULL)
    return;

  p = ptr;
  kmem_free (&p[-2], p[-2]);
}

/****************
 * Page locking *
 ****************/
/* I/O completion routine handed to ddi_umem_iosetup(); it does nothing and
   always returns 0. */
int
mx_iodone(struct buf *b)
{
  (void)b;

  return 0;
}

/*
 * On newer versions of solaris, the per-project memory limits are
 * quite low (32MB by default).  They can be raised by using prctl:
 *   prctl -n project.max-device-locked-memory -v 300MB -r -i project 3
 * Project numbers are defined in /etc/project and they are:
 *   system:0
 *   user.root:1
 *   noproject:2
 *   default:3
 *   group.staff:10
 */


/*
 * Pin-matching predicate used by mx_umem_callback() via mx_find_pin().
 * ARG is the umem cookie being searched for.  Returns 1 when PIN is
 * non-NULL and carries that cookie, 0 otherwise.
 */

int
mx_umem_cookie_test(mx_page_pin_t *pin, void *arg)
{
  ddi_umem_cookie_t *wanted = (ddi_umem_cookie_t *)arg;

  return (pin != NULL && pin->umem_cookie == wanted) ? 1 : 0;
}


/*
 * This routine is called when pinned memory is freed by an
 * application, either explicitly, or as part of an unclean exit.
 * A typical stack trace for this routine is:
 *             mx_driver`mx_umem_callback
 *             genunix`umem_lock_undo
 *             genunix`as_execute_callback
 *             genunix`as_do_callbacks
 *             genunix`as_free
 *             genunix`relvm
 *             genunix`proc_exit
 *             genunix`exit
 * 
 * It searches every open endpoint for a pin that carries COOKIE and,
 * on the first match, closes and frees that endpoint.  The search and
 * the close both run with mx_driver_mutex and mx_minor_mutex held.
 */

static void 
mx_umem_callback(ddi_umem_cookie_t *cookie)
{
  mx_endpt_state_t *es;
  mx_page_pin_t *pin;
  int minor;

  mutex_enter(&mx_driver_mutex);
  mutex_enter(&mx_minor_mutex);
  /* Walk the minor table; the loop ends as soon as one endpoint owns a
     matching pin (pin != NULL) or all minors have been examined. */
  for (pin = NULL, minor = 0; (pin == NULL) && (minor < mx_max_minor); 
       minor++) {
    es = mx_minors[minor].es;
    if (es == NULL)
      continue;	/* no endpoint on this minor */

    /* the pin lookup is done under the endpoint's own lock */
    mx_mutex_enter(&es->sync);
    pin = mx_find_pin(es, mx_umem_cookie_test, (void *)cookie);
    mx_mutex_exit(&es->sync);
    if (pin != NULL) {
      MX_INFO(("closing es %p for pid=%d due to umem callback\n", 
	       es, es->opener.pid));
      /* Tear the endpoint down and detach it from its minor slot.
	 NOTE(review): the minor slot itself is not released here --
	 presumably the later close of the device does that; confirm. */
      mx_common_close(es);
      mx_minors[minor].es = NULL;
      mx_kfree(es);
    }
  }
  mutex_exit(&mx_minor_mutex);
  mutex_exit(&mx_driver_mutex);
  if (pin == NULL)
    MX_DEBUG_PRINT(MX_DEBUG_KVA_TO_PHYS,
		   ("mx_umem_callback called for no reason??\n"));	
}  

/* Callback table passed to umem_lockmemory() by mx_pin_vpages(), so that
   mx_umem_callback() runs when the locked user memory goes away. */
static struct umem_callback_ops mx_umem_callback_ops = {
  UMEM_CALLBACK_VERSION,
  mx_umem_callback
};

/*
 * Lock NVPAGES MX virtual pages starting at pins[0].va, bind them for DMA
 * and write one bus address per vpage into MDESC.
 *
 *   is              instance whose dev_info is used for the DMA handle
 *   pins            pins[0].va is the start address; pins[0].dma_handle and
 *                   (for user memory) pins[0].umem_cookie are filled in
 *   mdesc           array of at least NVPAGES descriptors; low/high words
 *                   are stored with htonl()
 *   nvpages         number of MX vpages to pin
 *   flags           MX_PIN_KERNEL: VA is a kernel address, no user locking
 *                   MX_PIN_CONSISTENT: consistent rather than streaming DMA
 *                   MX_PIN_PHYSICAL: not supported, fails with EOPNOTSUPP
 *   memory_context  not used here
 *
 * Returns 0 on success.  On failure everything acquired so far is released,
 * pins[0].dma_handle is set to NULL and the failing call's status is
 * returned.
 */
int
mx_pin_vpages(mx_instance_state_t *is, mx_page_pin_t *pins, 
              mcp_dma_addr_t *mdesc, int nvpages,
              int flags, uint64_t memory_context)
{
  struct as *as;
  ddi_dma_attr_t dma_attr;
  ddi_dma_cookie_t cookie;
  uintptr_t va;
  uint64_t bus_addr;
  long num_pages;
  size_t dma_len;
  int len, status, ddi_flags, i;
  uint32_t dma_offset;
  uint_t cnt;


  len = MX_VPTOA(nvpages);
  va = (uintptr_t)pins[0].va;

  /* setup the dma attributes, and determine how many pages
     need to be DMA'ed */

  num_pages = MX_ATOP(MX_TRUNC_PAGE(va + len + PAGE_SIZE - 1)) -  MX_ATOP(va);
  bcopy(&mx_dma_attr, &dma_attr, sizeof (dma_attr));
  dma_attr.dma_attr_align = PAGE_SIZE;
  dma_attr.dma_attr_sgllen = num_pages;

  /* allocate a dma handle, using the attributes prepared above (not the
     global template, whose align field is left unset) */
  status = ddi_dma_alloc_handle(is->arch.dip, &dma_attr,
				DDI_DMA_DONTWAIT, 0, &pins[0].dma_handle);
  
  if (status != DDI_SUCCESS) {
    MX_NOTE(("Could not allocate dma handle.\n"));
    goto abort_with_nothing;
   }	

  if (flags & MX_PIN_PHYSICAL) {
    status = EOPNOTSUPP;
    goto abort_with_handle;
  }

  if (flags & MX_PIN_KERNEL) {
    /* kernel address space is always pinned, set as to null
       for call to ddi_dma_addr_bind_handle() below */
    as = NULL;    
  } else {
    /* lock down the user virtual memory addressed by va */
    as = curproc->p_as;
    status = umem_lockmemory((caddr_t)MX_TRUNC_PAGE(va), MX_PTOA(num_pages),
			     DDI_UMEMLOCK_LONGTERM | DDI_UMEMLOCK_READ 
			     | DDI_UMEMLOCK_WRITE, &pins->umem_cookie, 
			     &mx_umem_callback_ops, curproc);
    if (status != 0) {
      MX_NOTE(("Could not pin %ld vpages at %p.\n", num_pages, (void *)va));
      goto abort_with_handle;
    }
  }

  /* Translate DMA related flags to Solaris DMA flags */
  if (flags & MX_PIN_CONSISTENT)
    ddi_flags = DDI_DMA_RDWR | DDI_DMA_CONSISTENT;
  else
    ddi_flags = DDI_DMA_RDWR | DDI_DMA_STREAMING;


  /* Map the page(s) for DMA. */

  status = ddi_dma_addr_bind_handle(pins[0].dma_handle, as, 
				    (caddr_t)MX_TRUNC_PAGE(va), 
				    MX_PTOA(num_pages),
				    ddi_flags, DDI_DMA_DONTWAIT,
				    0, &cookie, &cnt);
  if (status != DDI_SUCCESS) {
    MX_WARN(("ddi_dma_addr_bind_handle fails for va = %p/%p, len = %d/%d",
	     (caddr_t)MX_TRUNC_PAGE(va), (void *)va,
	     (int)MX_PTOA(num_pages), (int)MX_PTOA(nvpages)));
    goto abort_with_mem;
  }

  /* extract the cookies into the supplied DMA descriptors.  Note that
     on some arches (sparcv9) the page size is larger than the MX
     vpage size, which makes things a bit tricky.  Even on x86, we
     sometimes get 2 contiguous pages which Solaris will fold into the
     same DMA cookie.  

     We make sure to take into account that the dma address starts on
     the page boundary, and the desired address starts on a vpage
     boundary, so we may be starting at some offset into the page
    */

  i = 0;
  dma_offset = va - MX_TRUNC_PAGE(va);
  dma_len = cookie.dmac_size - dma_offset;
  while(1) {
    while (dma_len > 0 && i < nvpages) {
      /* Invalidate the following entry before publishing this one. */
      if (i + 1 < nvpages)
	mdesc[i+1].low = MX_DMA_INVALID_ENTRY;
      MX_STBAR();
      /* Derive both halves from the same address so that a carry out of
	 the low word is reflected in the high word. */
      bus_addr = cookie.dmac_laddress + dma_offset;
      mdesc[i].low = htonl(MX_LOWPART_TO_U32(bus_addr));
      mdesc[i].high = htonl(MX_HIGHPART_TO_U32(bus_addr));
      i++;
      dma_offset += MX_VPAGE_SIZE;
      dma_len -= MX_VPAGE_SIZE;
    }
    if (i == nvpages)
      break;
    /* current cookie exhausted: continue with the next one from its start */
    ddi_dma_nextcookie(pins[0].dma_handle, &cookie);
    dma_offset = 0;
    dma_len = cookie.dmac_size;
  }

  return 0;

 abort_with_mem:
  if ((flags & MX_PIN_KERNEL) == 0)
    ddi_umem_unlock (pins[0].umem_cookie);
 abort_with_handle:
  ddi_dma_free_handle(&pins[0].dma_handle);
 abort_with_nothing:
  pins[0].dma_handle = NULL;
  return status;
}	


/* Undo mx_pin_vpages() for PIN: unbind and free the DMA handle, clear it,
   and, unless MX_PIN_KERNEL is set in FLAGS, unlock the user memory.
   IS and COUNT are not used. */
void
mx_unpin_vpages(mx_instance_state_t *is, mx_page_pin_t *pin, int count, int flags)
{
  int user_memory = ((flags & MX_PIN_KERNEL) == 0);

  ddi_dma_unbind_handle(pin->dma_handle);
  ddi_dma_free_handle(&pin->dma_handle);
  pin->dma_handle = NULL;

  if (user_memory)
    ddi_umem_unlock (pin->umem_cookie);
}


/* Pin a single vpage described by PIN, storing its DMA address in
   pin->dma.  Returns whatever mx_pin_vpages() returns. */
int
mx_pin_page(mx_instance_state_t *is, mx_page_pin_t *pin, int flags, uint64_t memory_context)
{
  int status;

  status = mx_pin_vpages(is, pin, &pin->dma, 1, flags, memory_context);
  return status;
}

/* Release a single vpage pinned with mx_pin_page(). */
void
mx_unpin_page(mx_instance_state_t *is, mx_page_pin_t *pin, int flags)
{
  const int one_page = 1;

  mx_unpin_vpages(is, pin, one_page, flags);
}

/* Copy LENGTH bytes from address USRC in the user space of process PROC
 * into the kernel buffer DST.
 *
 * The whole range is locked and mapped at once, so the amount of data is
 * assumed to be small.  Only used to fetch the source segment list in
 * mx_direct_get() for now.  PID is used for the warning message only.
 *
 * Returns 0 on success, the umem_lockmemory() error if the source cannot be
 * locked, or ENOMEM if no buf could be set up.
 */
int
mx_arch_copy_from_proc(mx_uaddr_t usrc, struct proc * proc, uint32_t pid,
		       void * dst,
		       uint32_t length)
{
  /* umem_lockmemory() requires a page aligned address, and a
     length which is a multiple of PAGE_SIZE*/
  uintptr_t page_base = MX_TRUNC_PAGE((uintptr_t)usrc);
  uintptr_t page_off = (uintptr_t)usrc - page_base;
  size_t span = MX_TRUNC_PAGE(page_off + length + PAGE_SIZE - 1);
  ddi_umem_cookie_t cookie;
  struct buf *bp;
  int err;

  /* lock the source range in the owning process */
  err = umem_lockmemory((void *)page_base, span, DDI_UMEMLOCK_READ,
			&cookie, NULL, proc);
  if (err != 0) {
    MX_WARN(("mx_arch_copy_from_proc: unable to lock pid %d's address 0x%lx, err = %d\n",
 	     pid, (long)(uintptr_t)usrc, err));
    return err;
  }

  /* wrap the locked range in a struct buf */
  bp = ddi_umem_iosetup(cookie, 0, span, B_READ, NODEV, 0,
			mx_iodone, DDI_UMEM_NOSLEEP);
  if (bp == NULL) {
    MX_WARN(("mx_arch_copy_from_proc: ddi_umem_iosetup returns NULL\n"));
    ddi_umem_unlock(cookie);
    return ENOMEM;
  }

  /* map, copy out of the mapping, then release everything in reverse */
  bp_mapin(bp);
  bcopy(bp->b_un.b_addr + page_off, dst, length);
  bp_mapout(bp);
  freerbuf(bp);
  ddi_umem_unlock(cookie);

  return 0;
}

/* Upper bound, in bytes, on what mx_arch_copy_user_to_user() locks, maps
   and copies per loop iteration. */
int mx_direct_get_chunksize=2*1024*1024;

/* Identifies the source process of a direct get.  mx_direct_get() fills it
   in; mx_arch_copy_user_to_user() interprets its src_space argument as a
   pointer to one of these. */
struct direct_get_callback_param {
  uint32_t pid;			/* source pid; only used in warning messages */
  struct proc * proc;		/* source process, passed to umem_lockmemory() */
};

/* OS specific callback for direct get, copying from another process
 * user-space to current process user-space.
 *
 *   udst       destination address in the current process
 *   usrc       source address in the source process
 *   src_space  pointer to a struct direct_get_callback_param naming the
 *              source process
 *   len        number of bytes to copy
 *
 * The copy proceeds in chunks of at most mx_direct_get_chunksize bytes;
 * each chunk of the source is locked, mapped into the kernel, copied out
 * and released again.  A non-positive mx_direct_get_chunksize disables
 * chunking.
 *
 * Returns 0 on success, the umem_lockmemory() error, ENOMEM if no buf
 * could be set up, or EFAULT if the copyout fails.
 */

int
mx_arch_copy_user_to_user(mx_uaddr_t udst,
			  mx_uaddr_t usrc, void * src_space,
			  uint32_t len)
{
  struct direct_get_callback_param * param = (struct direct_get_callback_param *) src_space;
  struct buf *bp;
  ddi_umem_cookie_t cookie;
  size_t copysize, maplen;
  uintptr_t start, offset;
  uint32_t chunk;
  int err = 0;

  /* Read the tunable once.  A zero chunk would never make progress and a
     negative one would be misinterpreted by the unsigned comparison
     below, so both are treated as "no chunking". */
  chunk = (mx_direct_get_chunksize > 0) ? (uint32_t)mx_direct_get_chunksize : 0;

  while (len != 0) {
    
    /* calculate copy size */
    if (chunk != 0 && chunk < len)
      copysize = chunk;
    else
      copysize = len;

    /* umem_lockmemory() requires a page aligned address, and a
       length which is a multiple of PAGE_SIZE*/
    
    start = MX_TRUNC_PAGE((uintptr_t)usrc);
    offset =  ((uintptr_t)usrc - start);
    maplen = MX_TRUNC_PAGE(offset + copysize + PAGE_SIZE - 1);

    /* map the src memory into the kernel */
    err =  umem_lockmemory((void *)start, maplen, DDI_UMEMLOCK_READ,
			   &cookie, NULL, param->proc);

    if (err) {
      MX_WARN(("mx_direct_get: unable to lock pid %d's address 0x%lx, err = %d\n",
	       param->pid, (long)(uintptr_t)usrc, err));
      break;
    }

    /* create a struct buf associated with this address range */
    bp = ddi_umem_iosetup(cookie, 0, maplen, B_READ, NODEV, 0,
			  mx_iodone, DDI_UMEM_NOSLEEP);
    if (bp == NULL) {
      MX_WARN(("mx_direct_get: ddi_umem_iosetup returns NULL\n"));
      err = ENOMEM;
      ddi_umem_unlock(cookie);
      break;
    }

    /* map it into the kernel address space */
    bp_mapin(bp);

    /* copy it to the destination */
    err = copyout(bp->b_un.b_addr + offset, (void *)(uintptr_t)udst, copysize);

    /* free resources */
    bp_mapout(bp);
    freerbuf(bp);
    ddi_umem_unlock(cookie);

    if (err != 0) {
      err = EFAULT;
      break;
    }

    /* book keeping */
    len -= copysize;
    usrc += copysize;
    udst += copysize;
  }

  return err;
}

/*
 *  Copy data quickly between two endpoints.
 *
 *  dst_es/dst_segs/dst_nsegs describe the destination in the current
 *  process, src_es/src_segs/src_nsegs the source in the process that opened
 *  SRC_ES; LENGTH is the number of bytes to transfer.
 *
 *  When a segment count is greater than 1, the first entry's vaddr is the
 *  user-space address of the real segment list, which is copied into a
 *  temporary kernel buffer first (from the current process for the
 *  destination, from the source process for the source).
 *
 *  Returns 0 on success, ENOMEM if a segment list cannot be allocated,
 *  EFAULT if the destination list cannot be copied in, or the error from
 *  mx_arch_copy_from_proc() / mx_direct_get_common().
 */

int
mx_direct_get(mx_endpt_state_t *dst_es, mx_shm_seg_t *dst_segs, uint32_t dst_nsegs,
	      mx_endpt_state_t *src_es, mx_shm_seg_t *src_segs, uint32_t src_nsegs,
	      uint32_t length)
{
  struct direct_get_callback_param param;
  int status = 0;

  /* get destination segments from current process */
  if (dst_nsegs > 1) {
    mx_uaddr_t uptr = dst_segs[0].vaddr;
    dst_segs = mx_kmalloc(dst_nsegs * sizeof(*dst_segs), 0);
    if (!dst_segs) {
      status = ENOMEM;
      goto abort_with_nothing;
    }
    /* copyin() reports failure with a non-zero value that is not an
       errno; translate it, as is done for copyout() elsewhere. */
    if (copyin((void *)(uintptr_t) uptr, dst_segs,
	       dst_nsegs * sizeof(*dst_segs)) != 0) {
      status = EFAULT;
      goto abort_with_dst_segs;
    }
  }

  param.proc = src_es->arch.proc;
  param.pid = src_es->opener.pid;

  /* get source segments from the source process */
  if (src_nsegs > 1) {
    mx_uaddr_t uptr = src_segs[0].vaddr;
    src_segs = (mx_shm_seg_t *) mx_kmalloc(src_nsegs * sizeof(*src_segs), 0);
    if (!src_segs) {
      status = ENOMEM;
      goto abort_with_dst_segs;
    }
    status = mx_arch_copy_from_proc(uptr, param.proc, param.pid, src_segs,
                                    src_nsegs * sizeof(*src_segs));
    if (status) {
      goto abort_with_src_segs;
    }
  }

  status = mx_direct_get_common(dst_segs, dst_nsegs,
                                &param, src_segs, src_nsegs,
                                length);

  /* The temporary lists only exist when the matching count is > 1. */
 abort_with_src_segs:
  if (src_nsegs > 1)
    mx_kfree(src_segs);
 abort_with_dst_segs:
  if (dst_nsegs > 1)
    mx_kfree (dst_segs);
 abort_with_nothing:
  return status;
}


/*
 * Find the register set that maps the memory space of device DIP and
 * report the PCI bus/device/function numbers of the device.
 *
 *   dip     device whose "reg" property is scanned
 *   flag    ENABLE_NVIDIA_ECRC: DIP is not one of our instances, so the
 *           soft state is neither consulted nor updated; any other value:
 *           the result is cached in (and served from) the instance state
 *   reg_set out: index of the first 32- or 64-bit memory register set
 *   busno, devno, funcno
 *           out: taken from the first "reg" entry; valid whenever the
 *           property exists and is non-empty, even if EIO is returned
 *
 * Returns 0 on success, ENXIO if the property (or the soft state) is
 * missing, ENODEV if the property is empty, EIO if no memory register set
 * was found.
 */
int
mx_solaris_reg_set (dev_info_t *dip, int flag, int *reg_set,
    unsigned long *busno, unsigned long *devno,
    unsigned long *funcno)
{
  /* Determine the register set containing the PCI resource we
     want to map (the memory-mappable part of the interface). We do
     this by scanning the DDI "reg" property of the interface,
     which is an array of mx_ddi_reg_set structures.  */

#define REGISTER_NUMBER(ip) ((ip)[0] >>  0 & 0xff)
#define FUNCTION_NUMBER(ip) ((ip)[0] >>  8 & 0x07)
#define DEVICE_NUMBER(ip)   ((ip)[0] >> 11 & 0x1f)
#define BUS_NUMBER(ip)      ((ip)[0] >> 16 & 0xff)
#define ADDRESS_SPACE(ip)   ((ip)[0] >> 24 & 0x03)
#define PCI_ADDR_HIGH(ip)   ((ip)[1])
#define PCI_ADDR_LOW(ip)    ((ip)[2])
#define PCI_SPAN_HIGH(ip)   ((ip)[3])
#define PCI_SPAN_LOW(ip)    ((ip)[4])

#define MX_DDI_REG_SET_32_BIT_MEMORY_SPACE 2
#define MX_DDI_REG_SET_64_BIT_MEMORY_SPACE 3

  int *data, i, *rs;
  u_int nelementsp;
  int instance;
  mx_instance_state_t *is = NULL;
#if MX_DEBUG
  char *address_space_name[] = { "Configuration Space",
				 "I/O Space",
				 "32-bit Memory Space",
				 "64-bit Memory Space"
  };
#endif

  if (ddi_prop_lookup_int_array (DDI_DEV_T_ANY, dip,
				 DDI_PROP_DONTPASS, "reg", &data,
				 &nelementsp) != DDI_SUCCESS) {
    MX_NOTE (("Could not determine register set.\n"));
    return ENXIO;
  }

  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		 ("There are %d register sets.\n", nelementsp / 5));
  if (!nelementsp) {
    MX_NOTE (("Didn't find any \"reg\" properties.\n"));
    ddi_prop_free (data);
    return ENODEV;
  }

  /* The first entry identifies the device itself. */
  rs = &data[0];
  *busno = BUS_NUMBER(rs);
  *devno = DEVICE_NUMBER(rs);
  *funcno = FUNCTION_NUMBER(rs);

  if (flag != ENABLE_NVIDIA_ECRC) {
    instance = ddi_get_instance(dip);
    is = ddi_get_soft_state (mx_instancep, instance);
    if (is == NULL) {
      MX_NOTE (("No soft state for instance %d.\n", instance));
      ddi_prop_free (data);
      return ENXIO;
    }
    /* return cached value, if any.  The property array must still be
       released on this path. */
    if (is->arch.reg_set_cached) {
      *reg_set = is->arch.reg_set;
      ddi_prop_free (data);
      return 0;
    }
  }

  /* Scan for the register number; each register set is 5 ints long. */
  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		 ("*** Scanning for register number.\n"));
  for (i = 0; i < nelementsp / 5; i++) {
    rs = &data[5 * i];
    
    MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		   ("Examining register set %d:\n", i));
    MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		   ("  Register number = %d.\n", REGISTER_NUMBER (rs)));
    MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		   ("  Function number = %d.\n", FUNCTION_NUMBER (rs)));
    MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		   ("  Device number   = %d.\n", DEVICE_NUMBER (rs)));
    MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		   ("  Bus number      = %d.\n", BUS_NUMBER (rs)));
    MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		   ("  Address space   = %d (%s ).\n", ADDRESS_SPACE (rs),
		    address_space_name[ADDRESS_SPACE (rs)]));
    MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		   ("  pci address 0x%08x %08x\n", PCI_ADDR_HIGH (rs),
		    PCI_ADDR_LOW (rs)));
    MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		   ("  pci span 0x%08x %08x\n", PCI_SPAN_HIGH (rs),
		    PCI_SPAN_LOW (rs)));
    
    /* We are looking for a memory property. */

    if (ADDRESS_SPACE (rs) == MX_DDI_REG_SET_64_BIT_MEMORY_SPACE
	|| ADDRESS_SPACE (rs) == MX_DDI_REG_SET_32_BIT_MEMORY_SPACE) {
      *reg_set = i;
      /* is is non-NULL exactly when flag != ENABLE_NVIDIA_ECRC */
      if (is != NULL) {
	is->arch.reg_set = i;
	is->arch.reg_set_cached = 1;
	is->board_span = PCI_SPAN_LOW (rs);
	is->arch.board_span_cached = 1;
      }
      MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		     ("Memory uses register set %d.\n", *reg_set));
      /* print from the property, not from is, which may be NULL here */
      MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		     ("Board span is 0x%x\n", (int) PCI_SPAN_LOW (rs)));
      break;
    }
  }

  ddi_prop_free (data);

  /* If no match, fail. */
  if (i >= nelementsp / 5) {
    MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		   ("mx_solaris_reg_set returning EIO\n"));
    return EIO;
  }

  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		 ("mx_solaris_reg_set returning 0\n"));
  return 0;
}

/* Map LEN bytes at OFFSET within the board's memory register set into the
   kernel and remember the mapping in the first free slot of
   is->arch.mappings.  Returns the kernel address, or 0 if the register set
   cannot be determined, no slot is free, or the mapping fails. */
void *
mx_map_io_space (mx_instance_state_t * is, uint32_t offset, uint32_t len)
{
  unsigned long unused;
  void *kaddr;
  int reg_set;
  int slot = 0;

  if (mx_solaris_reg_set (is->arch.dip, 0, &reg_set, &unused, 
			  &unused, &unused) != 0)  /* map fail */
    return 0;

  /* find the first slot that holds no mapping */
  while (slot < MX_ARCH_MAX_MAPPINGS && is->arch.mappings[slot].kaddr != 0)
    slot++;

  if (slot == MX_ARCH_MAX_MAPPINGS) {
    MX_WARN(("unable to map iospace due to lack of mapping space\n"));
    return 0;
  }

  if (ddi_regs_map_setup (is->arch.dip, reg_set,
			  (caddr_t *) &kaddr,
			  offset, len, &mx_dev_access_attr,
			  &is->arch.mappings[slot].handle) != DDI_SUCCESS)
    return 0;   /* map fail */

  is->arch.mappings[slot].kaddr = kaddr;
  return kaddr;
}

/* Release a mapping created by mx_map_io_space().  KADDR is the address
   that call returned; the matching slot in is->arch.mappings is looked up,
   its register mapping is freed and the slot is marked free again.  LEN is
   not used.  A NULL or unknown KADDR is reported and otherwise ignored. */
void
mx_unmap_io_space (mx_instance_state_t * is,
		   uint32_t len, void *kaddr)
{
  int mapping_index;

  /* Free slots are marked with kaddr == 0, so a NULL argument would
     "match" the first unused slot and free a handle that was never set up. */
  if (kaddr == NULL) {
    MX_WARN(("unable to unmap iospace, NULL address\n"));
    return;
  }

  for (mapping_index = 0; mapping_index < MX_ARCH_MAX_MAPPINGS; mapping_index++) {
    if (is->arch.mappings[mapping_index].kaddr == kaddr)
      break;
  }

  if (mapping_index == MX_ARCH_MAX_MAPPINGS) {
    MX_WARN(("unable to unmap iospace, can't find addr %p\n", kaddr));
    return;
  }

  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,("mx%d: unmapping kaddr 0x%p", is->id, kaddr));

  ddi_regs_map_free (&is->arch.mappings[mapping_index].handle);
  is->arch.mappings[mapping_index].kaddr = 0;
}


/*
 * Read values from /kernel/drv/mx_driver.conf
 *
 * Each tunable listed below is looked up as an integer property of DIP
 * under its own variable name; when the property is absent the variable
 * keeps its current value.  mx_nvidia_ecrc_enable is the exception: it
 * falls back to 1 rather than to its current value.
 */
static void
mx_read_conf(dev_info_t *dip)
{
/* name = property "<name>" of dip, defaulting to the current value of name.
   NOTE(review): the definition ends in "while(0);", so every use below
   expands to an extra empty statement.  The macro stays defined for the
   rest of the file. */
#define MX_DDI_PROP_GET(name) 					   \
  do {								   \
     name = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, #name, name);} \
  while(0);

  MX_DDI_PROP_GET(mx_debug_mask);
  MX_DDI_PROP_GET(mx_max_instance);
  MX_DDI_PROP_GET(mx_max_nodes);
  MX_DDI_PROP_GET(mx_max_endpoints);
  MX_DDI_PROP_GET(mx_max_send_handles);
  MX_DDI_PROP_GET(mx_small_message_threshold);
  MX_DDI_PROP_GET(mx_medium_message_threshold);
  MX_DDI_PROP_GET(mx_security_disabled);
  MX_DDI_PROP_GET(mx_intr_coal_delay);
  MX_DDI_PROP_GET(mx_override_e_to_f);
  /* explicit default of 1 instead of the variable's current value */
  mx_nvidia_ecrc_enable =
    ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, "mx_nvidia_ecrc_enable", 1);
    
}

#if #cpu(i386) || defined __i386 || defined i386 ||	\
	defined __i386__ || #cpu(x86_64) || defined __x86_64__

/* Read the PCI vendor and device ids of DIP's parent node into *VENDOR_ID
   and *DEVICE_ID.  Returns DDI_SUCCESS, or DDI_FAILURE (with the outputs
   untouched) if there is no parent or its config space cannot be set up. */
static int
mx_get_parent_id(dev_info_t *dip, uint16_t *vendor_id, uint16_t *device_id)
{
  int instance = ddi_get_instance(dip);
  dev_info_t *parent_dip = ddi_get_parent(dip);
  ddi_acc_handle_t handle;

  if (parent_dip == NULL) {
    cmn_err(CE_WARN, "mx%d: I'm an orphan?", instance);
    return (DDI_FAILURE);
  }

  if (pci_config_setup(parent_dip, &handle) != DDI_SUCCESS) {
    cmn_err(CE_WARN,
	    "mx%d: Could not access my parent's registers", instance);
    return (DDI_FAILURE);
  }

  /* the handle is only needed for these two reads */
  *vendor_id = pci_config_get16(handle, PCI_CONF_VENID);
  *device_id = pci_config_get16(handle, PCI_CONF_DEVID);
  pci_config_teardown(&handle);

  return (DDI_SUCCESS);
}

#include <vm/hat.h>
void *device_arena_alloc(size_t size, int vm_flag);
void device_arena_free(void *vaddr, size_t size);
/*
 * If the parent of DIP is the Nvidia bridge with vendor id 0x10de and
 * device id 0x005d, map the bridge's memory-mapped config space and set
 * bit 0x40 in the 32-bit register at offset 0x178 ("Enabling ECRC" in the
 * log message).
 *
 * Returns DDI_SUCCESS if the bit was set, DDI_FAILURE if the parent is not
 * that bridge or any step of locating/mapping it fails.
 */
static int
mx_enable_nvidia_ecrc(dev_info_t *dip)
{
  dev_info_t *parent_dip;
  int dontcare;
  unsigned long bus_number, dev_number, func_number;
  unsigned long paddr, base, pgoffset;
  char 		*cvaddr, *ptr;
  volatile uint32_t	*ptr32;
  int 		retval = DDI_FAILURE;
  int		status;
  uint16_t        vendor_id, device_id;
  int             instance;
  
  instance = ddi_get_instance(dip);

  parent_dip = ddi_get_parent(dip);
  if (parent_dip == NULL) {
    cmn_err(CE_WARN, "mx%d: I'm an orphan?", instance);
    return (DDI_FAILURE);
  }

  /* On failure the ids are not written, so they must not be examined. */
  if (mx_get_parent_id(dip, &vendor_id, &device_id) != DDI_SUCCESS)
    return (DDI_FAILURE);

  if (vendor_id != 0x10de || device_id != 0x005d) {
    return (DDI_FAILURE);
  }

  /* Only the bus/device/function numbers are needed.  They are filled in
     from the first "reg" entry before the memory register set is searched
     for, so EIO ("no memory register set") still leaves them valid; any
     other error means they were never written. */
  status = mx_solaris_reg_set(parent_dip, ENABLE_NVIDIA_ECRC, &dontcare, 
			      &bus_number, &dev_number, &func_number);
  if (status != 0 && status != EIO)
    return (DDI_FAILURE);
  
  /* find the config space address for the nf4 bridge */
  paddr = (0xe0000000UL + bus_number * 0x00100000UL +
	   (dev_number * 8 + func_number) * 0x00001000UL);
  
  base = paddr & (~MMU_PAGEOFFSET);
  pgoffset = paddr & MMU_PAGEOFFSET;
  
  /* map it into the kernel */
  cvaddr =  device_arena_alloc(ptob(1), VM_NOSLEEP);
  if (cvaddr == NULL) {
    cmn_err(CE_WARN, "mx%d: failed to map nf4: cvaddr\n", instance);
    return (DDI_FAILURE);
  }
  
  hat_devload(kas.a_hat, cvaddr, mmu_ptob(1), mmu_btop(base),
	      PROT_WRITE|HAT_STRICTORDER, HAT_LOAD_LOCK);
  
  /* Make sure the mapping really shows the bridge before writing to it.
     The accesses go to device registers, hence volatile. */
  ptr = cvaddr + pgoffset;
  vendor_id = *(volatile uint16_t *)(ptr + PCI_CONF_VENID);
  device_id = *(volatile uint16_t *)(ptr + PCI_CONF_DEVID);
  if (vendor_id != 0x10de || device_id != 0x005d) {
    cmn_err(CE_NOTE, "mx%d: Could not map Nvidia bridge\n", instance);
    goto abort_with_mapped;
  }

  ptr32 = (volatile uint32_t *)(ptr + 0x178);
  cmn_err(CE_NOTE, "Enabling ECRC on upstream Nvidia bridge "
	  "at %ld:%ld:%ld\n", bus_number, dev_number, func_number);
  *ptr32 |= 0x40;
  retval = DDI_SUCCESS;

abort_with_mapped:
  hat_unload(kas.a_hat, cvaddr, ptob(1), HAT_UNLOAD_UNLOCK);
  device_arena_free(cvaddr, ptob(1));
  return (retval);

}

#else
/* Non-x86 builds: there is no Nvidia bridge handling, so this always
   reports DDI_FAILURE. */
static int
mx_enable_nvidia_ecrc(dev_info_t *dip)
{
  (void)dip;

  return DDI_FAILURE;
}
#endif /* i386 */


/************************************************************************
 * Module entry points    (called via dev_ops structure)
 ************************************************************************/
/*
 * attach(9E) entry point: bring one MX board online.
 *
 * Only DDI_ATTACH is handled; any other command fails.  The whole routine
 * runs with mx_driver_mutex held.  The first call also reads the driver
 * configuration (mx_read_conf) and, once the board is up, creates the
 * "mxctl" and "mxctlp" control minor nodes.
 *
 * Returns DDI_SUCCESS once the instance is initialized and its mx%d and
 * mxp%d clone nodes exist; otherwise everything acquired so far is released
 * through the abort_with_* chain and DDI_FAILURE is returned.
 */
static int 
mx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
  int instance, err;
  minor_t minor;
  mx_instance_state_t *is;
  static int read_conf = 0;
  int create_mxctl = 0;

  if (cmd != DDI_ATTACH)
    return DDI_FAILURE;

  mutex_enter (&mx_driver_mutex);
  if (!read_conf) {
    mx_read_conf(dip);
    create_mxctl = 1;
    read_conf = 1;
  }

  if (mx_num_instances >= mx_max_instance)
    goto abort_with_driver_mutex;


  if (mx_driver_version[0] == 0) {
    (void) strcpy (mx_driver_version, MX_DRIVER_VERSION);   
    (void) strncpy (mx_driver_version + sizeof (MX_DRIVER_VERSION) - 1, 
		    MX_VERSION_STR, 40 - sizeof(MX_DRIVER_VERSION) - 1);
    mx_driver_version[39] = '\0';
  }

  instance = ddi_get_instance(dip);

  /* The instance number indexes mx_instance_ok[] and, doubled, the first
     mx_max_instance * 2 slots of mx_minors[]; anything outside that range
     would land in (or beyond) the slots handed out by mx_minor_alloc(). */
  if (instance < 0 || instance >= mx_max_instance) {
    MX_INFO (("MX board #%d exceeds the supported instance range.",
	      instance));
    goto abort_with_driver_mutex;
  }

  /* Verify that board can be DMA master. */
  if (ddi_slaveonly (dip) == DDI_SUCCESS) {
    MX_INFO (("MX board #%d in slave-only slot cannot be used.",
	      instance));
    goto abort_with_driver_mutex;
  }

  /* Allocate the "instance state" for the interface (as Solaris soft state) */

  if (ddi_soft_state_zalloc (mx_instancep, instance) != DDI_SUCCESS) {
    MX_INFO (("MX myri[%d]: soft state alloc failed", instance));
    goto abort_with_driver_mutex;
  }
  is = ddi_get_soft_state (mx_instancep, instance);

  /* Add the interrupt handler now that we know what IS is.  This will
     set is->arch.iblock_cookie so that we can properly initialize any
     mutex that depends on it.

     HACK: The interrupt handler will crash the machine if it called
     before the instance state is initialized.  We install it here
     anyway to get the iblock cookie, and then enable interrupts only
     after the instance is initialized. */

  /* Verify that high-level interrupts are supported */

  if (ddi_intr_hilevel (dip, 0) != 0) {
    MX_INFO (("High level interrupts not supported"));
    goto abort_with_instance_state;
  }

  ddi_get_iblock_cookie (dip, 0, &is->arch.iblock_cookie);

  is->id = instance;
  is->arch.dip = dip;
#if MX_SOLARIS_COMPLICATED_SPIN
  mx_sync_init (&is->arch.spin_sync, is);
#endif

  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		 ("[%d]: name matched (%s)\n", instance, ddi_get_name (dip)));

  /* Setup access to PCI configurations space before calling
     mx_instance_init(). */

  /* Enable access to PCI configuration space */
  if (pci_config_setup (dip, &is->arch.pci_acc_handle) != DDI_SUCCESS) {
    MX_INFO (("Could not setup access to PCI config space.\n"));
    goto abort_with_spin_sync;
  }


  if (ddi_add_intr (dip, 0, &is->arch.iblock_cookie, 0,
      	    (u_int (*)(caddr_t)) mx_solaris_intr, (void *) is) != 0) {
    MX_INFO (("cannot add interrupt handler."));
    goto abort_with_initialized_config_state;
  }

  /* Best effort: the result is deliberately ignored. */
  if (mx_nvidia_ecrc_enable)
    mx_enable_nvidia_ecrc(dip);

  /* Initialize the interface, including mapping the board regions,
     copying eeprom, and starting the LANai control program (with
     interrupts enabled.) */

  if (mx_instance_init (is, instance) != 0) {
    MX_INFO (("[%d]: Could not initialize instance.\n", instance));
    goto abort_with_intr;
  }
  /* NOTE(review): presumably mx_instance_init() returns with is->sync
     held on success -- confirm against its definition. */
  mx_mutex_exit(&is->sync);

  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		 ("mx_instance_init instance=%d completed successfully\n",
		  instance));
  mx_instance_ok[instance]++;


  /*******************************************************************
   * Create devices. /dev/mx%d is the clone device for normal users, *
   * and /dev/mxp%d is the clone device for privileged users.        *
   *******************************************************************/

  minor = instance * 2;
  mutex_enter (&mx_minor_mutex);
  mx_minors[minor].is = is;
  mx_minors[minor].es = NULL;
  mx_minors[minor].privileged = 0;
  mx_minors[minor + 1].is = is;
  mx_minors[minor + 1].es = NULL;
  mx_minors[minor + 1].privileged = 1;
  mutex_exit (&mx_minor_mutex);
  
  /* Create the minor node (/dev/mx?) */
  {
    char name[20];
    
    /* create /dev/mx? clone device. */
    sprintf (name, "mx%d", instance);
    if (ddi_create_minor_node (dip, name, S_IFCHR, minor,
			       DDI_PSEUDO, 0) != DDI_SUCCESS) {
      MX_INFO (("ddi_create_minor_node failed for unit mx%d.", instance));
      goto abort_with_mx_minor_table;
    }

    /* create /dev/mxp* privileged clone device. */
    sprintf (name, "mxp%d", instance);
    if (ddi_create_minor_node (dip, name, S_IFCHR, minor + 1,
			       DDI_PSEUDO, 0) != DDI_SUCCESS) {
      MX_INFO (("ddi_create_minor_node failed for unit mxp%d.", instance));
      goto abort_with_clone_minor_node;
    }
  }

  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		 ("created minor nodes\n"));

  /* Put pointer to private info into dip. */
  ddi_set_driver_private (dip, (caddr_t) is);

  /* Report attachment */
  ddi_report_dev (dip);

  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT,
		 ("Reported attachment. Done.\n"));

  /* The control nodes are created only by the attach call that read the
     configuration; a failure here is logged but does not fail the attach. */
  if (create_mxctl == 1)
    {
	err = ddi_create_minor_node(dip, "mxctl", S_IFCHR, MX_CTL,
				       DDI_PSEUDO, 0);
	err |= ddi_create_minor_node(dip, "mxctlp", S_IFCHR, MX_CTLP,
				       DDI_PSEUDO, 0);
	if (err != 0)
	  MX_WARN(("Failed to create CTL minors, err = %d\n", err));
    }

  mutex_exit (&mx_driver_mutex);
  return DDI_SUCCESS;

  /******* Error handling *******/

abort_with_clone_minor_node:
  ddi_remove_minor_node (dip, NULL);
abort_with_mx_minor_table:
  mx_minor_free(minor + 1);
  mx_minor_free(minor);

  /* The instance was marked usable above; undo that before its state is
     finalized and freed (mx_detach clears the flag the same way). */
  mx_instance_ok[instance] = 0;

  mx_instance_finalize (is);

 abort_with_intr:
  ddi_remove_intr (dip, 0, is->arch.iblock_cookie);
abort_with_initialized_config_state:
  pci_config_teardown (&is->arch.pci_acc_handle);
abort_with_spin_sync:
#if MX_SOLARIS_COMPLICATED_SPIN
  mx_sync_destroy (&is->arch.spin_sync);
#endif
abort_with_instance_state:
  ddi_soft_state_free (mx_instancep, instance);
abort_with_driver_mutex:
  mutex_exit (&mx_driver_mutex);
  return DDI_FAILURE;
}

/* detach(9E) entry point: undo mx_attach() in reverse order.

   Only DDI_DETACH is handled.  Returns DDI_SUCCESS after the board has been
   torn down, or DDI_FAILURE when the command is unsupported or no soft
   state exists for this instance (nothing is touched in that case). */
static int
mx_detach (dev_info_t * dip, ddi_detach_cmd_t cmd)
{
  mx_instance_state_t *is;
  int instance;

  if (cmd != DDI_DETACH)
    return (DDI_FAILURE);

  mutex_enter (&mx_driver_mutex);
  instance = ddi_get_instance (dip);
  is = ddi_get_soft_state (mx_instancep, instance);

  /* Without soft state there is nothing to tear down, and everything
     below dereferences `is'. */
  if (is == NULL) {
    mutex_exit (&mx_driver_mutex);
    return (DDI_FAILURE);
  }

  ddi_remove_minor_node (dip, NULL);

  /* Clear the two clone-device slots that mx_attach filled in. */
  mutex_enter (&mx_minor_mutex);
  bzero ((void *)&mx_minors[instance * 2 + 1], sizeof(mx_minor_state_t));
  bzero ((void *)&mx_minors[instance * 2], sizeof(mx_minor_state_t));
  mutex_exit (&mx_minor_mutex);

  /* Quiesce the board before its handler is removed. */
  if (is->board_ops.disable_interrupt != NULL)
    is->board_ops.disable_interrupt(is);

  ddi_remove_intr (dip, 0, is->arch.iblock_cookie);
  mx_instance_finalize (is);
  pci_config_teardown (&is->arch.pci_acc_handle);
#if MX_SOLARIS_COMPLICATED_SPIN
  mx_sync_destroy (&is->arch.spin_sync);
#endif
  ddi_soft_state_free (mx_instancep, instance);
  mx_instance_ok[instance] = 0;

  mutex_exit (&mx_driver_mutex);
  return DDI_SUCCESS;
}
 
/* getinfo(9E) entry point: perform minor device number to device instance
   mapping for the kernel.

   `arg' carries the dev_t being queried.  For DDI_INFO_DEVT2INSTANCE,
   *result receives the board id (is->id); for DDI_INFO_DEVT2DEVINFO it
   receives the board's dev_info pointer (is->arch.dip).

   Returns DDI_SUCCESS when the minor maps to an instance.  Returns
   DDI_FAILURE for an unknown command, for a minor outside mx_minors[]
   (which includes the control minors, since _init requires
   mx_max_minor < MX_CTL), or for a slot with no instance attached. */

static int
mx_getinfo (dev_info_t * dip, ddi_info_cmd_t infocmd, void *arg,
	    void **result)
{
  mx_instance_state_t *is;
  unsigned int minor;
  int status = DDI_FAILURE;

  /* Reject unknown commands before taking the lock, so that the lock is
     only ever released on paths that acquired it. */
  if (infocmd != DDI_INFO_DEVT2INSTANCE && infocmd != DDI_INFO_DEVT2DEVINFO)
    return DDI_FAILURE;

  /* mx_minors[] has mx_max_minor entries; never index past it. */
  minor = (unsigned int) getminor ((dev_t) arg);
  if (minor >= (unsigned int) mx_max_minor)
    return DDI_FAILURE;

  mutex_enter (&mx_driver_mutex);
  is = mx_minors[minor].is;
  if (is != NULL) {
    if (infocmd == DDI_INFO_DEVT2INSTANCE)
      *result = (void *)(uintptr_t) is->id;
    else
      *result = (void *) is->arch.dip;
    status = DDI_SUCCESS;
  }
  mutex_exit (&mx_driver_mutex);

  return status;
}

/* Open a device.  For MX this is a clone open: little is initialized
   here beyond reserving a fresh minor number (mx_minor_alloc() marks the
   slot by setting mx_minors[minor].is to non-NULL) and handing the
   resulting dev_t back through *devp.  Later, a MX_SET_ENDPOINT ioctl
   allocates the mx_endpt_state and lets it be fully initialized.  The
   new minor is the first free slot mx_minor_alloc() finds.  es->is.id
   records the board number, es->endpt records the endpt number.

   Opening one of the control minors (MX_CTL / MX_CTLP) succeeds without
   reserving anything.  Returns 0 on success, otherwise EINVAL (bad otyp),
   ENODEV (no such board), ENOMEM (no device number), or the error
   reported by mx_minor_alloc(). */
 
static int
mx_open (dev_t *devp, int flags, int otyp, cred_t * credp)
{
  mx_instance_state_t *board;
  dev_t cloned;
  int clone_minor, fresh_minor, err;

  if (otyp != OTYP_CHR) {
    MX_INFO (("mx_open() called with bad otyp.\n"));
    return EINVAL;
  }

  clone_minor = (int)getminor (*devp);
  if (clone_minor == MX_CTL || clone_minor == MX_CTLP)
    return 0;

  if (clone_minor >= mx_max_instance * 2)
    return ENODEV;

  /* Two clone minors per board: even is normal, odd is privileged. */
  board = mx_instances[clone_minor/2];
  if (board == NULL)
    return ENODEV;

  mutex_enter(&mx_driver_mutex);

  /* Reserve an unused minor number for this open. */
  err = mx_minor_alloc(board, &fresh_minor);
  if (err != 0) {
    MX_NOTE (("Could not create new minor number, mx_minor_alloc returns %d.\n",
	      err));
    mutex_exit(&mx_driver_mutex);
    return err;
  }
  mx_minors[fresh_minor].privileged = clone_minor & 1;

  /* Same major number, new minor number: this is what the OS will use
     for every later call on this open. */
  cloned = makedevice (getmajor (*devp), fresh_minor);
  if (cloned == NODEV) {
    MX_NOTE (("Could not make device number.\n"));
    mx_minor_free(fresh_minor);
    mutex_exit(&mx_driver_mutex);
    return ENOMEM;
  }
  *devp = cloned;

  MX_DEBUG_PRINT(MX_DEBUG_OPENCLOSE,
		 ("User opened device with minor number 0x%x.\n", 
		  getminor (cloned)));
  mutex_exit (&mx_driver_mutex);
  return 0;
}


/* Close a device.  The control minors need no cleanup.  For any other
   minor inside the table, the slot is released and, if an endpoint had
   been attached to it, that endpoint is closed and its state freed.
   Returns 0, or ENODEV for a minor beyond the table. */
static int
mx_close (dev_t dev, int flags, int otyp, cred_t * credp)
{
  mx_endpt_state_t *endpt;
  int which = (int)getminor(dev);

  if (which == MX_CTL || which == MX_CTLP)
    return 0;

  if (which >= mx_max_minor)
    return ENODEV;

  mutex_enter (&mx_driver_mutex);

  /* Grab the endpoint pointer first: mx_minor_free() zeroes the slot. */
  endpt = mx_minors[which].es;
  mx_minor_free(which);

  if (endpt != NULL) {
    mx_common_close(endpt);
    mx_kfree(endpt);
  }

  mutex_exit(&mx_driver_mutex);
  return 0;
}

/* Attach an endpoint to an already-opened minor (the MX_SET_ENDPOINT and
   MX_SET_RAW ioctls).

   is     board the minor belongs to; EINVAL if NULL.
   arg    user address of an mx_set_endpt_t; read and written back only
          when `raw' is 0.
   raw    non-zero for MX_SET_RAW, in which case nothing is copied in or
          out.
   minor  index into mx_minors[] that receives the new endpoint state.

   Returns 0 on success, EFAULT if the copyin fails, ERANGE for an endpoint
   number outside [0, mx_max_endpoints), ENOMEM if the endpoint state
   cannot be allocated, or whatever mx_common_open() / mx_arch_copyout()
   report. */
static int
mx_set_endpoint(mx_instance_state_t *is, mx_uaddr_t arg, int raw, int minor)
{  
  int status;
  mx_set_endpt_t set_endpt;
  mx_endpt_state_t *es;
  rval_t rpid;

  if (is == NULL)
    return EINVAL;

  /* In raw mode nothing is copied in, yet set_endpt.endpoint is still
     handed to mx_common_open(); start from zeros so that value is
     defined. */
  bzero ((void *)&set_endpt, sizeof(set_endpt));

  if (!raw) {
    status = mx_arch_copyin(arg, &set_endpt, sizeof(set_endpt));
    if (status)
      return EFAULT;
    if (set_endpt.endpoint < 0 || set_endpt.endpoint >= mx_max_endpoints)
      return ERANGE;
  }

  es = mx_kmalloc(sizeof(mx_endpt_state_t), MX_MZERO|MX_WAITOK);
  if (es == 0)
    return ENOMEM;  

  es->privileged = mx_minors[minor].privileged;
  rpid.r_vals = getpid();
  es->opener.pid = rpid.r_val1;
  es->arch.proc = curproc;
  es->is_kernel = 0;
  
  /* is->id is the board number */
  status = mx_common_open(is->id, set_endpt.endpoint, es, raw);
  if (status != 0) {
    mx_kfree(es);
    return status;
  }
  set_endpt.session_id = es->session_id;
  
  if (!raw) 
    status = mx_arch_copyout(&set_endpt, arg, sizeof(set_endpt));
 
  /* NOTE(review): the endpoint stays attached to the minor even when the
     copyout above failed and a non-zero status is returned; it is then
     released by mx_close().  Confirm this is the intended contract. */
  mx_minors[minor].es = es;

  return status;
}


/* ioctl entry point.

   Kernel-originated ioctls (FKIOCTL) are refused with ENOTSUP.  Requests
   on the control minors go straight to mx_endptless_ioctl(); any other
   minor beyond the table yields ERANGE.  For a minor with no endpoint
   yet, MX_SET_ENDPOINT / MX_SET_RAW create one and everything else is
   treated as endpoint-less.  Once an endpoint exists, the request goes to
   mx_common_ioctl() with the endpoint's ref_count raised for the duration
   of the call; commands it answers with ENOTTY are retried as
   endpoint-less. */
static int
mx_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t * cred_p, 
	 int *rval_p)
{
  mx_endpt_state_t *endpt;
  unsigned int which;
  int rc, priv;

  /* Make sure ioctl is not being called by the kernel, 
     since we cannot handle that yet. */
  if (mode & FKIOCTL)
    return ENOTSUP;

  which = (unsigned int)getminor(dev);

  if (which >= mx_max_minor) {
    if (which != MX_CTLP && which != MX_CTL)
      return ERANGE;
    /* The low bit of the control minor is passed as the privilege flag. */
    return mx_endptless_ioctl(cmd, arg, which & 1, 0);
  }

  endpt = mx_minors[which].es;
  priv = mx_minors[which].privileged;

  /* Some ioctls do not require a valid endpoint state */
  if (endpt == NULL) {
    if (cmd == MX_SET_ENDPOINT)
      return mx_set_endpoint(mx_minors[which].is, (mx_uaddr_t)arg, 0, which);
    if (cmd == MX_SET_RAW)
      return mx_set_endpoint(mx_minors[which].is, (mx_uaddr_t)arg, 1, which);
    return mx_endptless_ioctl(cmd, arg, priv, 0);
  }

  /* others do.. */

  mx_mutex_enter(&endpt->sync);
  endpt->ref_count++;
  mx_mutex_exit(&endpt->sync);

  rc = mx_common_ioctl(endpt, cmd, arg);
  if (rc == ENOTTY)
    rc = mx_endptless_ioctl(cmd, arg, priv, 0);

  mx_mutex_enter(&endpt->sync);
  endpt->ref_count--;
  mx_mutex_exit(&endpt->sync);

  return rc;
}

/* read entry point: copy the board's status string out to the caller.

   The string comes from mx_instance_status_string() and is freed here
   before returning.  A read that starts past the end of the string
   transfers nothing and returns 0.

   Returns 0 on success, EINVAL for a minor beyond the table, ENODEV when
   the minor has no board attached, or the error reported by
   mx_instance_status_string() / uiomove(). */
static int 
mx_read(dev_t dev, struct uio *uio, cred_t *credp)
{
  unsigned int minor;
  mx_instance_state_t *is;
  int len;
  int status = 0;
  off_t offset;
  char *c;

  minor = (unsigned int)getminor(dev);

  if (minor >= mx_max_minor) {
    return EINVAL;
  }  

  /* A slot inside the table may still be unassigned. */
  is = mx_minors[minor].is;
  if (is == NULL)
    return ENODEV;
  
  status = mx_instance_status_string(is->id, &c, &len);
  if (status)
    return status;

  offset = uio->uio_offset;
  if (offset > strlen(c))
      goto abort_with_c;
  /* NOTE(review): the bound above uses strlen(c) while the transfer size
     uses len; presumably len is at least strlen(c) -- confirm. */
  status = uiomove(c+offset, MIN((len - offset), uio->uio_resid), 
		   UIO_READ, uio);

 abort_with_c:
  mx_kfree(c);
  return status;
}

/* Map interface memory into the process.  This can be used to map the
   LANai control register, special registers, SRAM, or copy block into
   the user space.  The offsets and sizes of these various regions can
   be obtained using the MX_GET_MAPPING_SPECS ioctl, which returns
   a description of the mappings in a "mx_mapping_specs"
   structure.

   The pages of the copy block must be mapped in order and without
   skipping any offsets.

   mx_prepare_to_mmap ensures that all offsets passed to mx_mmap are
   legitimate, so there is no need to range check offsets here.

   We map the memory using devmap_devmem_setup for device memory,
   and devmap_umem_setup for copyblock memory.

   On success returns 0 and stores the mapped length in *maplen (always
   PAGE_SIZE for copyblock memory).  On failure returns a non-zero error
   and sets *maplen to 0; in particular EINVAL is returned for a minor
   beyond the table, a minor with no endpoint, or a memory type this
   routine does not know how to map.
*/

static int 
mx_devmap (dev_t dev, devmap_cookie_t dhp, offset_t off, 
	   size_t len, size_t *maplen, uint_t model)
{
  mx_instance_state_t *is;
  mx_endpt_state_t *es;
  offset_t roff, koff;
  void *kva;
  mx_page_pin_t *pin;
  int reg_set, status, mem_type = -1;
  uint_t prot = PROT_USER|PROT_READ|PROT_WRITE;
  unsigned int minor = getminor(dev);
  unsigned long dontcare;

  if (minor >= mx_max_minor)
    return EINVAL;

  mutex_enter (&mx_driver_mutex);
  es = mx_minors[minor].es;

  if (!es) {
    MX_PRINT (("es NULL"));
    status = EINVAL;
    goto abort_with_driver_mutex;
  }

  mx_assert (es);
  mx_mutex_enter (&es->sync);

  /* determine the kva for this request */

  status = mx_mmap_off_to_kva(es, off, &kva, &mem_type, &pin);
  if (status != 0) {
    MX_DEBUG_PRINT (MX_DEBUG_KVA_TO_PHYS,
		    ("status = %d, len = 0x%lx\n", status, (long)len));
    goto abort_with_es_mutex;
  }
  mx_assert (kva);

  is = es->is;
  mx_assert (is);

  /* Only the register-set number is needed here; the remaining outputs
     are discarded. */
  status = mx_solaris_reg_set(is->arch.dip, 0, &reg_set, 
			      &dontcare, &dontcare, &dontcare);
  if (status)
    goto abort_with_es_mutex;

  switch (mem_type) {
    /* if the mapping is in io space, we find out how far into
       the mapping it is, and then use devmap_devmem_setup()
       to map it */
  case MX_MEM_SRAM:
    roff = (char *)kva - (char *)is->lanai.sram;
    status = devmap_devmem_setup (dhp, is->arch.dip, NULL,
				 reg_set, roff, len, prot, 0, 
				 &mx_dev_access_attr);
    break;
  case MX_MEM_CONTROL:
    roff = (char *)kva - (char *)is->lanai.control_regs;
    status = devmap_devmem_setup (dhp, is->arch.dip, NULL,
				 reg_set, roff, len, prot, 0, 
				 &mx_dev_access_attr);
    break;
  case MX_MEM_SPECIAL:
    roff = (char *)kva - (char *)is->lanai.special_regs;
    status = devmap_devmem_setup (dhp, is->arch.dip, NULL,
				 reg_set, roff, len, prot, 0, 
				 &mx_dev_access_attr);
    break;

  case MX_MEM_HOSTMEM:
    /* it is a kernel address so we are mapping out portions of the
       copyblock and we need to find the umem cookie corresponding to
       that copyblock segment, as well as the offset into the
       segment */
    koff = 0;  /* offset should always be zero, since
		  allocation is done by PAGE_SIZE */
    len = PAGE_SIZE;  /* reported through *maplen on the way out */
    status = devmap_umem_setup(dhp, is->arch.dip, NULL,
			       pin->umem_cookie,
			       koff, len, prot, 0,
			       &mx_dev_access_attr);
    break;

  default:
    /* Nothing was set up, so do not report a successful mapping. */
    status = EINVAL;
    break;
  }

  if (status !=0)
    MX_PRINT (("umem/devmem_setup status = %d, type = %d\n",status, mem_type));

abort_with_es_mutex:
  mx_mutex_exit (&es->sync);
abort_with_driver_mutex:
  mutex_exit (&mx_driver_mutex);  
  if (status == 0)
    *maplen = len;
  else
    *maplen = 0;

  return status;
}

/* identify entry point: decide from the device node name whether this
   driver handles the device.

   Returns 0 (DDI_IDENTIFIED) when the name matches one of the prefixes
   compiled in through MX_2G_ENABLED and/or MX_10G_ENABLED, and -1
   (DDI_NOT_IDENTIFIED) otherwise.  The two prefix lists are tested
   independently, so enabling both accepts a match from either list. */
static int 
mx_identify (dev_info_t * dip)
{
  char *name;
  int identified = 0;

  name = ddi_get_name (dip);

  MX_INFO (("identifying name \"%s\"\n", name));

#if MX_2G_ENABLED
  if (strncmp (name, "pci14c1,8043", 12) == 0
      || strncmp (name, "pci14c1,8044", 12) == 0
      || strncmp (name, "pci14c1,0", 9) == 0)
    identified = 1;
#endif
#if MX_10G_ENABLED
  if (strncmp (name, "pci14c1,8", 9) == 0
      || strncmp (name, "pciex14c1,8", 11) == 0
      || strncmp (name, "ethernet", 8) == 0)
    identified = 1;
#endif

  if (identified)
    return (0);    /* DDI_IDENTIFIED */
  return (-1);     /* DDI_NOT_IDENTIFIED */
}

/************************************************************************
 * Dynamic loading entry points.
 ************************************************************************/

/* Module load entry point.

   Sets up the two driver mutexes, the soft-state anchor, the minor table
   (mx_minors) and the per-instance ok flags (mx_instance_ok), initializes
   the machine-independent part of MX and finally installs the module.

   Returns DDI_SUCCESS on success.  On any failure everything set up so far
   is undone and a non-zero error is returned: the code reported by the
   failing call, EINVAL when the minor range would collide with MX_CTL, or
   ENOMEM when a table cannot be allocated. */
int
_init (void)
{
  int error;

  mutex_init (&mx_driver_mutex, "MX driver mu", MUTEX_DRIVER, NULL);
  mutex_init (&mx_minor_mutex, "MX solaris driver minor mu", MUTEX_DRIVER, NULL);
  /* preallocate state for 1 MX card */
  error
    = ddi_soft_state_init (&mx_instancep, sizeof (mx_instance_state_t), 1);
  if (error != 0) {
    MX_WARN(("ddi_soft_state_init returned %d\n", error));
    goto abort_with_mutex_init;
  }

  /* Two clone minors per board, followed by the minors handed out by
     mx_minor_alloc(). */
  mx_max_minor = mx_max_instance * 2 + mx_max_instance * mx_max_endpoints * 2;

  if (mx_max_minor >= MX_CTL) {
    MX_NOTE(("mx_max_minor collides with MX_CTL\n"));
    /* `error' is still 0 here; make sure the load is reported as failed. */
    error = EINVAL;
    goto abort_with_device_state;
  }

  mx_minors = (mx_minor_state_t *)
      mx_kmalloc(mx_max_minor * sizeof(mx_minor_state_t), MX_MZERO);
  if (!mx_minors) {
    error = ENOMEM;
    goto abort_with_device_state;
  }

  mx_instance_ok = (unsigned int *)
    mx_kmalloc(mx_max_instance * sizeof(*mx_instance_ok), MX_MZERO);
  if (!mx_instance_ok) {
    error = ENOMEM;
    goto abort_with_minors;
  }

  /* initialize the MI portions of MX */
  error = mx_init_driver();
  if (error != 0) {
    MX_WARN(("mx_init_driver returned %d\n", error));
    goto abort_with_mx_instance_ok;
  }

  error = mod_install (&modlinkage);
  if (error != 0) {
    MX_WARN(("mod_install returned %d\n", error));
    goto abort_with_init_driver;
  }

  return DDI_SUCCESS;

abort_with_init_driver:
  (void) mx_finalize_driver();
abort_with_mx_instance_ok:
  mx_kfree(mx_instance_ok);
abort_with_minors:
  mx_kfree(mx_minors);
abort_with_device_state:
  ddi_soft_state_fini (&mx_instancep);
abort_with_mutex_init:
  mutex_destroy (&mx_minor_mutex);
  mutex_destroy (&mx_driver_mutex);
  MX_INFO (("mx device driver initialization failed."));
  return error;
}

/* Module unload entry point.

   If mod_remove() refuses, its error is returned and nothing else is
   touched.  Otherwise the machine-independent layer is finalized, the
   tables allocated by _init are freed, the soft-state anchor is released,
   both driver mutexes are destroyed and 0 is returned. */
int
_fini (void)
{
  int rc;

  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT, ("finalizing driver ()\n"));
  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT, ("Calling mod_remove()\n"));
  rc = mod_remove (&modlinkage);
  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT, ("Called mod_remove()\n"));

  if (rc != 0) {
    MX_NOTE(("error calling mod_remove.\n"));
    return rc;
  }

  (void) mx_finalize_driver();

  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT, ("Calling mx_kfree()\n"));
  mx_kfree(mx_instance_ok);
  mx_kfree(mx_minors);

  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT, ("Calling ddi_soft_state_fini()\n"));
  mx_assert (mx_instancep);
  ddi_soft_state_fini (&mx_instancep);
  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT, ("Called ddi_soft_state_fini()\n"));

  /* Don't use "RETURN" macro, since it might print something after the
     print mutex has been destroyed. */
  MX_DEBUG_PRINT(MX_DEBUG_BOARD_INIT, ("_fini  returning\n"));

  mutex_destroy (&mx_minor_mutex);
  mutex_destroy (&mx_driver_mutex);
  MX_INFO(("MX driver finalized\n"));
  return 0;
}

/* Module information entry point: hands the request to mod_info() for
   this module's linkage and returns its result unchanged. */
int
_info (struct modinfo *modinfop)
{
  int rc = mod_info (&modlinkage, modinfop);

  return rc;
}

/************
 * reserve a free minor number for a newly opened device.
 *
 * Scans mx_minors[] from slot mx_max_instance * 2 upward, under
 * mx_minor_mutex, for the first slot whose `is' field is zero.  On
 * success the slot is claimed by storing `is' in it, its index is written
 * to *minor_p and 0 is returned.  If every slot is taken, ENODEV is
 * returned and *minor_p is left untouched.
 ************/
int
mx_minor_alloc(mx_instance_state_t *is, int *minor_p)
{
  int slot;
  int rc = ENODEV;

  mutex_enter (&mx_minor_mutex);

  slot = mx_max_instance * 2;
  while (slot < mx_max_minor) {
    if (mx_minors[slot].is == 0) {
      /* free slot: claim it and report success */
      mx_minors[slot].is = is;
      *minor_p = slot;
      rc = 0;
      break;
    }
    slot++;
  }

  mutex_exit(&mx_minor_mutex);
  return rc;
}

/************
 * free a previously allocated minor number.
 *
 * The whole mx_minors[] slot is zeroed under mx_minor_mutex.  A minor
 * outside [0, mx_max_minor) is reported, trips mx_assert, and is otherwise
 * ignored so that memory outside the table is never written.
 ************/
void 
mx_minor_free(int minor)
{
  if (minor < 0 || minor >= mx_max_minor) {
    MX_NOTE(("Attempt to free a bad minor (%d), max = %d\n",
	      minor, mx_max_minor));
    mx_assert (0);
    /* Only reached if the assertion does not stop execution. */
    return;
  }

  /* setting mx_minors[minor] to 0 */
  mutex_enter (&mx_minor_mutex);
  bzero ((void *)&mx_minors[minor], sizeof(mx_minor_state_t));
  mutex_exit (&mx_minor_mutex);

  return;
}

/* Allocate one page of DMA-able, user-mappable kernel memory and bind it
   for DMA (previously gm_arch_dma_region_alloc).

   is          board whose dev_info is used for the DMA handle.
   alloc_addr  receives the address returned by ddi_umem_alloc().
   addr        receives the same address.
   pin         receives the DMA handle, umem cookie, DMA cookie, burst-size
               status bits, kernel virtual address and the split 64-bit
               DMA address.

   Returns 0 on success.  On failure everything acquired here is released
   and one of EINVAL, ENOSPC, ENOMEM, MX_UNTRANSLATED_SYSTEM_ERROR or
   MX_INTERNAL_ERROR is returned.

   Side effect: the file-level mx_dma_attr alignment and granularity are
   set to PAGE_SIZE on every call. */
int
mx_alloc_dma_page(mx_instance_state_t *is, char **alloc_addr, 
		  char **addr, mx_page_pin_t *pin)
{
  /* karen: should set ddi_flags correctly */
  int bind_flags = DDI_DMA_READ | DDI_DMA_WRITE | DDI_DMA_CONSISTENT;
  unsigned int ncookies;
  int rc = 0;

  mx_dma_attr.dma_attr_align = PAGE_SIZE;
  mx_dma_attr.dma_attr_granular = PAGE_SIZE;

  rc = ddi_dma_alloc_handle(is->arch.dip, &mx_dma_attr, DDI_DMA_DONTWAIT,
			    NULL, &pin->dma_handle);
  if (rc != DDI_SUCCESS) {
    MX_NOTE (("Could not allocate DMA handle; status = %d.", rc));
    if (rc != DDI_DMA_BADATTR) {
      MX_PRINT (("could not alloc DMA hanlde"));
      return ENOSPC;  /* should be some thing else */
    }
    MX_NOTE (("Bad DMA attributes."));
    return EINVAL;
  }

  /* ddi_umem_alloc allocates page-aligned kernel memory,
   * which by default, is locked. */
  *addr = ddi_umem_alloc(PAGE_SIZE, DDI_UMEM_NOSLEEP, &pin->umem_cookie);
  *alloc_addr = *addr;
  if (*addr == NULL) {
    MX_NOTE (("Could not allocate DMAable memory.\n"));
    rc = ENOMEM;
    goto release_handle;
  }

  rc = ddi_dma_addr_bind_handle (pin->dma_handle, NULL,
				 (caddr_t)(*alloc_addr), PAGE_SIZE,
				 bind_flags, DDI_DMA_DONTWAIT, NULL,
				 &pin->dma_cookie, &ncookies);
  if (rc != DDI_SUCCESS) {
    MX_NOTE (("Could not bind dma handle: status = %d", rc));
    rc = MX_UNTRANSLATED_SYSTEM_ERROR;  /*find out the return value later*/
    goto release_umem;
  }

  /* A single page is expected to bind as exactly one full-page cookie. */
  mx_assert (ncookies == 1);
  mx_assert (pin->dma_cookie.dmac_size == PAGE_SIZE);

  /* Compute the DMA status bits for the dma region. */
  pin->sts = (ddi_dma_burstsizes (pin->dma_handle) >> 3) & 0x1ff;

  /* Verify computed PCI burstsize is legit. */
  if (pin->sts != 0xf) {
    MX_NOTE (("PCI burst size not computed correctly.\n"));
    rc = MX_INTERNAL_ERROR;  /*find out the return value later*/
    goto release_binding;
  }

  if (ddi_dma_sync (pin->dma_handle, 0, PAGE_SIZE, DDI_DMA_SYNC_FORDEV)
      != DDI_SUCCESS) {
    MX_NOTE (("Could not sync DMA region for device\n"));
    rc = MX_INTERNAL_ERROR;
    goto release_binding;
  }

  if (ddi_dma_sync (pin->dma_handle, 0, PAGE_SIZE, DDI_DMA_SYNC_FORCPU)
      != DDI_SUCCESS) {
    MX_NOTE (("Could not sync DMA region for CPU\n"));
    rc = MX_INTERNAL_ERROR;
    goto release_binding;
  }

  /* Record where the page lives, for the CPU and for the device. */
  pin->va = (uint64_t)(unsigned long)*addr;
  pin->dma.high = (uint64_t) pin->dma_cookie.dmac_laddress >> 32;
  pin->dma.low = pin->dma_cookie.dmac_laddress;

  return 0;

 release_binding:
  ddi_dma_unbind_handle (pin->dma_handle);
 release_umem:
  ddi_umem_free (pin->umem_cookie);
 release_handle:
  ddi_dma_free_handle (&pin->dma_handle);
  return rc;
}

/* Release a page obtained from mx_alloc_dma_page(): unbind the DMA
   handle, free the umem allocation, free the handle, then zero the pin
   record.  When MX_DEBUG is non-zero the caller's address is cleared as
   well. */
void
mx_free_dma_page(mx_instance_state_t *is, char **alloc_addr, mx_page_pin_t *pin)
{
  ddi_dma_unbind_handle(pin->dma_handle);
  ddi_umem_free(pin->umem_cookie);
  ddi_dma_free_handle(&pin->dma_handle);

  bzero((void *)pin, sizeof(*pin));

  if (MX_DEBUG) {
    *alloc_addr = 0;
  }
}


/* Free every DMA page recorded in a copyblock, then the pin array itself.
   Does nothing when the copyblock has no pin array; entries whose recorded
   address is zero are skipped.  cb->pins is NULL on return. */
void
mx_optimized_free_copyblock(mx_instance_state_t *is, mx_copyblock_t *cb)
{
  int idx;

  if (cb->pins == NULL)
    return;

  for (idx = 0; idx < (cb->size / PAGE_SIZE); idx++) {
    char *page = (char *)(unsigned long)cb->pins[idx].va;

    if (page == 0)
      continue;
    mx_free_dma_page(is, &page, &cb->pins[idx]);
  }

  mx_kfree(cb->pins);
  cb->pins = NULL;
}

/* Allocate the pin array for a copyblock and one zeroed DMA page per
   PAGE_SIZE of cb->size.

   Returns 0 on success.  Returns ENOMEM when the pin array cannot be
   allocated, or the error from mx_alloc_zeroed_dma_page(); in the latter
   case the pages allocated so far are released through
   mx_optimized_free_copyblock().

   The page loop uses the same count, cb->size / PAGE_SIZE, as the pin
   array allocation and as mx_optimized_free_copyblock(), so it can never
   write past the array.
   NOTE(review): presumably cb->size is always a multiple of PAGE_SIZE;
   a trailing partial page would not get a pin entry -- confirm with the
   callers. */
int
mx_optimized_alloc_copyblock(mx_instance_state_t *is, mx_copyblock_t *cb)
{
  char *va;
  int i, status;

  cb->pins = mx_kmalloc(sizeof(cb->pins[0]) * (cb->size / PAGE_SIZE), 
			MX_MZERO|MX_WAITOK);
  if (cb->pins == NULL) {
    MX_WARN(("copyblock pin info allocation failed due to lack of memory\n"));
    status = ENOMEM;
    goto abort_with_nothing;
  }
 
  for (i = 0; i < (cb->size / PAGE_SIZE); i++) {
    /* The page address is not kept here; it is recorded in the pin. */
    status = mx_alloc_zeroed_dma_page(is, &va, &va, 
				      &cb->pins[i]);

    if (status)
      goto abort_with_dma_pages;
  }

  return 0;

 abort_with_dma_pages:
  mx_optimized_free_copyblock(is, cb);

 abort_with_nothing:
  return status;

}

/* Copy the system node name into mx_default_hostname, truncating if
   needed and always leaving the buffer NUL-terminated. */
void
mx_set_default_hostname(void)
{
  size_t last = sizeof(mx_default_hostname) - 1;

  strncpy(mx_default_hostname, utsname.nodename, last);
  mx_default_hostname[last] = '\0';
}

/* Return an int filled with bytes from random_get_bytes().

   The value starts out as 0 so that the result is well defined even if
   random_get_bytes() fails without writing the buffer (its return value
   is deliberately ignored, as before). */
int
mx_rand(void)
{
  int ret = 0;
  
  (void) random_get_bytes((uint8_t *)&ret, sizeof(ret));
  return ret;
}

/* Called when an assertion fails: panics with the failed expression and
   the source line and file it came from. */
void 
mx_assertion_failed (const char *assertion, int line, const char *file)
{
  panic(
    "MX: assertion: <<%s>>  failed at line %d, file %s\n",
    assertion,
    line,
    file);
}


/* Starting the mapper is not implemented here: always returns ENOTSUP. */
int
mx_start_mapper(mx_instance_state_t *is)
{
  int rc = ENOTSUP;

  return rc;
}

/* Stopping the mapper is not implemented here: always returns ENOTSUP. */
int
mx_stop_mapper(mx_instance_state_t *is)
{
  int rc = ENOTSUP;

  return rc;
}


/* Printing goop from GM */

/* Format a printf-style message into a local 256-byte buffer behind an
   "INFO: " prefix and emit it with cmn_err(CE_CONT).  Output that does not
   fit is truncated.

   The finished text is passed to cmn_err() as a "%s" argument, never as
   the format itself, so a '%' produced by the first formatting pass is not
   interpreted a second time. */
void
mx_solaris_info(char *format, ...)
{
  va_list ap;
  char buf[256] = {"INFO: "};
  size_t used = strlen(buf);

  va_start(ap, format);
  vsnprintf(buf + used, sizeof(buf) - used, format, ap);
  va_end(ap);
  cmn_err(CE_CONT, "%s", buf);
}
/* Format a printf-style message into a local 256-byte buffer behind a
   "WARN: " prefix and emit it with cmn_err(CE_CONT).  Output that does not
   fit is truncated.

   The finished text is passed to cmn_err() as a "%s" argument, never as
   the format itself, so a '%' produced by the first formatting pass is not
   interpreted a second time. */
void
mx_solaris_warn(char *format, ...)
{
  va_list ap;
  char buf[256] = {"WARN: "};
  size_t used = strlen(buf);

  va_start(ap, format);
  vsnprintf(buf + used, sizeof(buf) - used, format, ap);
  va_end(ap);
  cmn_err(CE_CONT, "%s", buf);
}

/* Forward a transmit-done event to the ether_tx_done hook stored in the
   instance's arch state.  The hook is called unconditionally. */
void
mx_ether_tx_done(mx_instance_state_t *is, uint32_t mcp_index)
{
  struct mx_instance_state *state = (struct mx_instance_state *)is;

  (*is->arch.ether_tx_done)(state, mcp_index);
}

/* Forward a small-receive-done event, with all of its arguments
   unchanged, to the ether_rx_done_small hook stored in the instance's
   arch state.  The hook is called unconditionally. */
void
mx_ether_rx_done_small(mx_instance_state_t *is, int count, int len, 
		       int csum, int flags)
{
  struct mx_instance_state *state = (struct mx_instance_state *)is;

  (*is->arch.ether_rx_done_small)(state, count, len, csum, flags);
}

/* Forward a big-receive-done event, with all of its arguments unchanged,
   to the ether_rx_done_big hook stored in the instance's arch state.  The
   hook is called unconditionally. */
void
mx_ether_rx_done_big(mx_instance_state_t *is, int count, int len, 
		     int csum, int flags)
{
  struct mx_instance_state *state = (struct mx_instance_state *)is;

  (*is->arch.ether_rx_done_big)(state, count, len, csum, flags);
}

/* Tell the ethernet side about a link change, if it registered an
   ether_link_change hook; otherwise do nothing. */
void
mx_ether_link_change_notify(mx_instance_state_t *is)
{
  if (is->arch.ether_link_change == NULL)
    return;

  (*is->arch.ether_link_change)(is);
}

/* Placeholder: only logs that it was called, then reports success (0). */
int
mx_ether_parity_detach(mx_instance_state_t *is)
{
  int rc = 0;

  MX_WARN(("mx_ether_parity_detach called!\n"));
  return rc;
}

/* Placeholder: only logs that it was called. */
void
mx_ether_parity_reattach(mx_instance_state_t *is)
{
  MX_WARN(("mx_ether_parity_reattach called!\n"));
  return;
}
